# Replication entrypoint for regression outputs only.
# Saves tidy regression coefficient tables (no row-level microdata exports).

library(here)
library(tidyverse)
library(janitor)
library(estimatr)
library(broom)

microdata_file <- here::here("microdata.Rdata")
if (!file.exists(microdata_file)) {
  stop("Missing microdata.Rdata. Run preamble.R once to generate it.")
}
load(microdata_file)

results_dir <- here::here("regressions")
if (!dir.exists(results_dir)) {
  dir.create(results_dir, recursive = TRUE)
}

save_regression_output <- function(df, stem) {
  readr::write_csv(df, file.path(results_dir, paste0(stem, ".csv")))
  saveRDS(df, file.path(results_dir, paste0(stem, ".rds")))
}

message("Building shared analysis frames...")

coded_qual_pre_post_resolved_with_covariates <- bind_rows(
  coded_qual_resolved %>%
    mutate(after_probe = treat == "Elaboration", coding_round = 1) %>%
    left_join(
      combined %>%
        select(-matches("(issue|occu|news|econ)"), matches("probe_flag")),
      by = "respondent_id"
    ),
  coded_qual_pre_resolved %>%
    mutate(after_probe = FALSE, coding_round = 2) %>%
    left_join(
      combined %>%
        select(-matches("(issue|occu|news|econ)"), matches("probe_flag")),
      by = "respondent_id"
    )
)

nlp_measures_l <- nlp_measures %>%
  mutate(response = case_when(
    treatment == "Control" & grepl("econd_reason_1", response) ~ "econd_last_response",
    TRUE ~ response
  )) %>%
  mutate(exp = gsub("_(combined|last)_response", "", response) %>%
    exp_ordered_labels_f()) %>%
  pivot_longer(
    names_to = "measure",
    values_to = "val",
    unique_words:shannon_entropy
  )

nlp_measures_l_with_covariates <- nlp_measures_l %>%
  left_join(
    combined %>%
      select(-matches("(issue|occu|news|econ)"), matches("probe_flag")),
    by = c("treatment", "respondent_id")
  )

message("Running coded-quality models (main + appendix)...")

coded_qual_all_results <- map_dfr(
  quality_criteria,
  function(out) {
    reg_subsets <- list(
      "Most Imp.\nIssue" = "issue",
      "Econ. Cond.\n(Reason)" = "econd",
      "Pooled" = c("issue", "econd")
    )
    lapply(seq_along(reg_subsets), function(k) {
      bind_rows(
        coded_qual_pre_post_resolved_with_covariates %>%
          rename_at(out, ~ "y") %>%
          filter(q %in% reg_subsets[[k]]) %>%
          mutate(y = as.numeric(y)) %>%
          filter((treat == "Elaboration" & after_probe) |
            (treat == "Control" & !after_probe)) %>%
          lm(y ~ treat, data = .) %>%
          tidy() %>%
          mutate(type = "No Controls", design = "Diff-in-Means"),
        coded_qual_pre_post_resolved_with_covariates %>%
          rename_at(out, ~ "y") %>%
          filter(q %in% reg_subsets[[k]]) %>%
          mutate(y = as.numeric(y)) %>%
          filter((treat == "Elaboration" & after_probe) |
            (treat == "Control" & !after_probe)) %>%
          filter(!is.na(after_probe)) %>%
          lm(y ~ treat + work_status + gender + hh_inc5 + educ5 + log(age + 1) + device_type, data = .) %>%
          tidy() %>%
          mutate(type = "Controls", design = "Diff-in-Means"),
        coded_qual_pre_post_resolved_with_covariates %>%
          rename_at(out, ~ "y") %>%
          filter(q %in% reg_subsets[[k]]) %>%
          mutate(y = as.numeric(y)) %>%
          filter(!is.na(after_probe)) %>%
          lm(y ~ after_probe, data = .) %>%
          tidy() %>%
          mutate(type = "No Controls", design = "Pre-Post"),
        coded_qual_pre_post_resolved_with_covariates %>%
          rename_at(out, ~ "y") %>%
          filter(q %in% reg_subsets[[k]]) %>%
          mutate(y = as.numeric(y)) %>%
          filter(!is.na(after_probe)) %>%
          lm(y ~ after_probe + work_status + gender + hh_inc5 + educ5 + log(age + 1) + device_type, data = .) %>%
          tidy() %>%
          mutate(type = "Controls", design = "Pre-Post")
      ) %>%
        mutate(
          outcome = out %>%
            stringr::str_replace_all("_", " ") %>%
            stringr::str_to_title(),
          subset = names(reg_subsets)[[k]]
        )
    })
  }
)
save_regression_output(coded_qual_all_results, "coded_quality_all_results")

message("Running NLP models for main manuscript...")

nlp_measure_names_main <- nlp_measure_names[1:5]

nlp_results_main <- map_dfr(
  seq_along(names(nlp_measure_names_main)),
  function(k) {
    map_dfr(
      c("issue", "econd", "news", "occu"),
      function(e) {
        d_1 <- nlp_measures_l_with_covariates %>%
          filter(!(grepl("(News|Occu)", exp) & grepl("Control", treatment))) %>%
          filter(!(grepl("(Issue|Econ)", exp) & grepl("Conf", treatment))) %>%
          filter(grepl("last_response", response) & !(grepl("why_frustrated", response))) %>%
          filter(measure == names(nlp_measure_names_main)[k] &
            exp == exp_ordered_labels[[e]]) %>%
          rename(y = val) %>%
          filter(!is.na(y)) %>%
          mutate(y = (y - mean(y)) / sd(y))

        d_2 <- nlp_measures_l_with_covariates %>%
          filter(!(grepl("(News|Occu)", exp) & grepl("Control", treatment))) %>%
          filter(!(grepl("(Issue|Econ)", exp) & grepl("Conf", treatment))) %>%
          mutate(response = case_when(
            treatment == "Control" & grepl("econd_last", response) ~ "econd_combined_response",
            TRUE ~ response
          )) %>%
          filter(grepl("combined_response", response) & !(grepl("why_frustrated", response))) %>%
          filter(measure == names(nlp_measure_names_main)[k] &
            exp == exp_ordered_labels[[e]]) %>%
          rename(y = val) %>%
          filter(!is.na(y)) %>%
          mutate(y = (y - mean(y)) / sd(y))

        if (e == "occu") {
          f <- y ~ treatment + hh_inc5 + gender + educ5 + log(age + 1) + device_type
        } else {
          f <- y ~ treatment + work_status + hh_inc5 + gender + educ5 + log(age + 1) + device_type
        }

        bind_rows(
          d_1 %>%
            lm(y ~ treatment, data = .) %>%
            tidy(conf.level = 0.95, conf.int = TRUE) %>%
            mutate(outcome = "Post Probe", specif = "No Controls"),
          d_2 %>%
            lm(y ~ treatment, data = .) %>%
            tidy(conf.level = 0.95, conf.int = TRUE) %>%
            mutate(outcome = "Combined", specif = "No Controls"),
          d_1 %>%
            lm(f, data = .) %>%
            tidy(conf.level = 0.95, conf.int = TRUE) %>%
            mutate(outcome = "Post Probe", specif = "Controls"),
          d_2 %>%
            lm(f, data = .) %>%
            tidy(conf.level = 0.95, conf.int = TRUE) %>%
            mutate(outcome = "Combined", specif = "Controls")
        ) %>%
          mutate(
            measure = nlp_measure_names_main[[k]] %>%
              stringr::str_wrap(width = 10),
            exp = e
          )
      }
    )
  }
)
save_regression_output(nlp_results_main, "main_nlp_results")

message("Running attrition models for main manuscript...")

nr_types <- list(
  "Dropout\n(Right After)" = "_dropout_rightafter",
  "Dropout\n(Any Time After)" = "_dropout_after"
)

nr_results_main <- map_dfr(
  seq_along(nr_types),
  function(k) {
    map_dfr(
      c("issue", "econd_reason", "news", "occu"),
      function(exp) {
        d <- combined %>%
          rename_at(paste0(exp, nr_types[[k]]), ~ "y")

        if (exp == "occu") {
          f <- y ~ treatment + hh_inc5 + gender + educ5 + age5 + device_type
        } else {
          f <- y ~ treatment + work_status + hh_inc5 + gender + educ5 + age5 + device_type
        }

        bind_rows(
          d %>%
            lm(y ~ treatment, data = .) %>%
            tidy() %>%
            mutate(type = "No Controls"),
          d %>%
            lm(f, data = .) %>%
            tidy() %>%
            mutate(type = "Controls")
        ) %>%
          mutate(
            nr_type = names(nr_types)[k],
            exp = exp
          )
      }
    )
  }
)
save_regression_output(nr_results_main, "main_attrition_results")

message("Running user-experience models for main manuscript...")

ux_results_main <- map_dfr(
  c("quality", "ease", "frustration", "satisfaction"),
  function(out) {
    bind_rows(
      combined %>%
        rename_at(out, ~ "y") %>%
        mutate(y = as.numeric(y)) %>%
        mutate(y = y / max(y, na.rm = TRUE)) %>%
        lm_robust(y ~ treatment, data = .) %>%
        tidy() %>%
        mutate(type = "No Controls"),
      combined %>%
        rename_at(out, ~ "y") %>%
        mutate(y = as.numeric(y)) %>%
        mutate(y = y / max(y, na.rm = TRUE)) %>%
        lm_robust(y ~ treatment + work_status + gender + hh_inc5 + educ5 + log(age + 1) + device_type, data = .) %>%
        tidy() %>%
        mutate(type = "Controls")
    ) %>%
      mutate(outcome = out %>%
        stringr::str_replace_all("_", " ") %>%
        stringr::str_to_title())
  }
) %>%
  mutate(outcome = as_factor(outcome), type = as_factor(type)) %>%
  adj_bhq(alpha = global_stat_sig_level)

save_regression_output(ux_results_main, "main_user_experience_results")

message("Running heterogeneity models for appendix...")

coded_qual_resolved_with_covariates <- coded_qual_resolved %>%
  left_join(
    combined %>%
      select(-matches("(issue|occu|news|econ)"), matches("probe_flag")),
    by = "respondent_id"
  )

coded_qual_hetfx_results <- map_dfr(
  quality_criteria,
  function(out) {
    reg_subsets <- list(
      "Most Imp. Issue" = "issue",
      "Econ. Cond. (Reason)" = "econd",
      "Both" = c("issue", "econd")
    )
    map_dfr(
      seq_along(reg_subsets),
      function(k) {
        purrr::map_dfr(
          c("work_status", "gender", "educ5", "age5", "device_type", "hh_inc5"),
          function(var) {
            purrr::map_dfr(
              unique(sort(combined[[var]])) %>%
                na.omit() %>%
                str_filter("Tablet") %>%
                str_filter("(Other|Refused|IDK)"),
              function(val) {
                d <- coded_qual_resolved_with_covariates %>%
                  filter_at(var, ~ .x == val) %>%
                  rename_at(out, ~ "y") %>%
                  filter(q %in% reg_subsets[[k]]) %>%
                  mutate(y = as.numeric(y)) %>%
                  filter(!is.na(y)) %>%
                  filter(treatment %in% c("Control", "Elab/Rel."))

                d %>%
                  lm(y ~ treatment, data = .) %>%
                  tidy() %>%
                  mutate(type = "No Controls") %>%
                  mutate(
                    outcome = out %>%
                      stringr::str_replace_all("_", " ") %>%
                      stringr::str_to_title(),
                    subset = names(reg_subsets)[[k]],
                    val = val,
                    var = var_labels[[var]],
                    outcome_variation = length(unique(d$y)) > 1,
                    estimate = ifelse(!outcome_variation, NA, estimate),
                    p.value = ifelse(!outcome_variation, NA, p.value),
                    std.error = ifelse(!outcome_variation, NA, std.error)
                  )
              }
            )
          }
        )
      }
    )
  }
)
save_regression_output(coded_qual_hetfx_results, "appendix_hetfx_coded_quality_results")

nlp_measure_names_appendix <- c(
  "unique_words" = "Unique Words",
  "total_words" = "Total Words",
  nlp_measure_names[c(3, 4, 5)]
)

nlp_hetfx_results <- map_dfr(
  seq_along(names(nlp_measure_names_appendix)),
  function(k) {
    map_dfr(
      c("issue", "news", "occu"),
      function(e) {
        if (e == "occu") {
          vars <- c("gender", "educ5", "age5", "device_type", "hh_inc5")
        } else {
          vars <- c("work_status", "gender", "educ5", "age5", "device_type", "hh_inc5")
        }

        map_dfr(
          vars,
          function(var) {
            map_dfr(
              unique(sort(combined[[var]])) %>%
                na.omit() %>%
                str_filter("Tablet") %>%
                str_filter("(Other|Refused|IDK)"),
              function(val) {
                d <- nlp_measures_l_with_covariates %>%
                  dplyr::filter_at(var, ~ .x == val)

                d_1 <- d %>%
                  filter(!(grepl("(News|Occu)", exp) & grepl("Control", treatment))) %>%
                  filter(!(grepl("(Issue|Econ)", exp) & grepl("Conf", treatment))) %>%
                  filter(grepl("last_response", response) & !(grepl("why_frustrated", response))) %>%
                  filter(measure == names(nlp_measure_names_appendix)[k] &
                    exp == exp_ordered_labels[[e]]) %>%
                  rename(y = val) %>%
                  filter(!is.na(y)) %>%
                  mutate(y = (y - mean(y, na.rm = TRUE) / sd(y, na.rm = TRUE)))

                d_2 <- d %>%
                  filter(!(grepl("(News|Occu)", exp) & grepl("Control", treatment))) %>%
                  filter(!(grepl("(Issue|Econ)", exp) & grepl("Conf", treatment))) %>%
                  mutate(response = case_when(
                    treatment == "Control" & grepl("econd_last", response) ~ "econd_combined_response",
                    TRUE ~ response
                  )) %>%
                  filter(grepl("combined_response", response) & !(grepl("why_frustrated", response))) %>%
                  filter(measure == names(nlp_measure_names_appendix)[k] &
                    exp == exp_ordered_labels[[e]]) %>%
                  rename(y = val) %>%
                  filter(!is.na(y)) %>%
                  mutate(y = (y - mean(y, na.rm = TRUE) / sd(y, na.rm = TRUE)))

                bind_rows(
                  if (nrow(d_1) > 1 && all(is.finite(d_1$y))) {
                    d_1 %>%
                      lm(y ~ treatment, data = .) %>%
                      tidy(conf.level = 0.95, conf.int = TRUE) %>%
                      mutate(
                        type = "Post Probe",
                        outcome_variation = length(unique(d_1$y)) > 1
                      )
                  } else {
                    data.frame()
                  },
                  if (nrow(d_2) > 1 && all(is.finite(d_2$y))) {
                    d_2 %>%
                      lm(y ~ treatment, data = .) %>%
                      tidy(conf.level = 0.95, conf.int = TRUE) %>%
                      mutate(
                        type = "Combined",
                        outcome_variation = length(unique(d_2$y)) > 1
                      )
                  } else {
                    data.frame()
                  }
                ) %>%
                  mutate(
                    measure = nlp_measure_names_appendix[[k]] %>%
                      stringr::str_wrap(width = 10),
                    val = val,
                    var = var_labels[[var]],
                    exp = e,
                    estimate = ifelse(!outcome_variation, NA, estimate),
                    p.value = ifelse(!outcome_variation, NA, p.value),
                    std.error = ifelse(!outcome_variation, NA, std.error)
                  )
              }
            )
          }
        )
      }
    )
  }
)
save_regression_output(nlp_hetfx_results, "appendix_hetfx_nlp_results")

ux_hetfx_results <- map_dfr(
  c("quality", "ease", "frustration", "satisfaction"),
  function(out) {
    map_dfr(
      c("work_status", "gender", "educ5", "age5", "device_type", "hh_inc5"),
      function(var) {
        map_dfr(
          unique(sort(combined[[var]])) %>%
            na.omit() %>%
            str_filter("Tablet") %>%
            str_filter("(Other|Refused|IDK)"),
          function(val) {
            combined %>%
              filter(treatment %in% c("Control", "Elab/Rel.")) %>%
              rename_at(out, ~ "y") %>%
              mutate(y = as.numeric(y)) %>%
              filter(!is.na(y)) %>%
              mutate(y = y / max(y, na.rm = TRUE)) %>%
              filter_at(var, ~ .x == val) %>%
              lm(y ~ treatment, data = .) %>%
              tidy() %>%
              mutate(
                val = val,
                var = var_labels[[var]],
                outcome = out
              )
          }
        )
      }
    ) %>%
      adj_bhq(alpha = global_stat_sig_level)
  }
) %>%
  mutate_at(
    c("outcome", "var"),
    ~ .x %>%
      stringr::str_replace("_", " ") %>%
      stringr::str_replace("age5", "age") %>%
      stringr::str_replace("work status", "employed") %>%
      stringr::str_replace("type", "")
  ) %>%
  mutate(
    val = as_factor(val),
    outcome = stringr::str_to_title(outcome)
  )

save_regression_output(ux_hetfx_results, "appendix_hetfx_user_experience_results")

all_regression_results <- list(
  coded_qual_all_results = coded_qual_all_results,
  nlp_results_main = nlp_results_main,
  nr_results_main = nr_results_main,
  ux_results_main = ux_results_main,
  coded_qual_hetfx_results = coded_qual_hetfx_results,
  nlp_hetfx_results = nlp_hetfx_results,
  ux_hetfx_results = ux_hetfx_results
)

saveRDS(all_regression_results, here::here("results.rds"))

message("DONE. Regression outputs saved to: ", results_dir)
